library(GEOquery)
library(crayon)
library(tidyverse)
library(Seurat)
library(SeuratData)
library(Azimuth)
library(patchwork)
library(Matrix)
library(RCurl)
library(DoubletFinder)
library(scales)
library(SoupX)
library(cowplot)
library(metap)
library(SingleCellExperiment)
library(DropletUtils)
library(AnnotationHub)
library(HGNChelper)
library(ensembldb)
library(multtest)
library(glmGamPoi)
library(pbapply)
library(data.table)
library(ggrepel)
library(ggpubr)
library(stringr)
library(canvasXpress)
library(clinfun)
library(GGally)
library(factoextra)
library(DESeq2)
library(limma)
library(fgsea)
library(org.Mm.eg.db)
library(kableExtra)

Step 1: Data Exploration and Import

Step 1.1: Read in all sample matrix and create Seurat object

# Specify path to supplementary files. 
supp_file_path = "/Users/kendrix/Documents/Work/UnityHealth/Data_Projects/scRNAseq_LungFibrosisAnalysis/GSE264278/"

# Get unique sample names.
sample_names = list.files(path = paste0(supp_file_path, "raw_data"), 
                          pattern = "GSM", 
                          full.names = FALSE) %>% grep("matrix", ., value = TRUE)
sample_names
## [1] "GSM8215373_matrix_inflection_demulti_day00UT.txt"     
## [2] "GSM8215375_matrix_inflection_demulti_day03BLM.txt"    
## [3] "GSM8215377_matrix_inflection_demulti_day07BLM.txt"    
## [4] "GSM8215379_matrix_inflection_demulti_day14BLM.txt"    
## [5] "GSM8215381_matrix_inflection_demulti_day28BLM.txt"    
## [6] "GSM8215383_matrix_inflection_demulti_day42BLM.txt"    
## [7] "GSM8215385_matrix_inflection_demulti_day63BLM.txt"    
## [8] "GSM8215387_matrix_inflection_demulti_day7day14LPS.txt"
## [9] "GSM8215389_matrix_inflection_demulti_day42LPS.txt"
# Create Seurat object for each sample.
for (file in sample_names){
  seurat_data <- read.table(file = paste0(supp_file_path, "raw_data/", file), 
                            header = TRUE,
                            row.names = 1)
  seurat_obj <- CreateSeuratObject(counts = seurat_data,
                                   min.features = 100,
                                   project = file)
  assign(file, seurat_obj)
}
## Warning: Data is of class data.frame. Coercing to dgCMatrix.
## Warning: Data is of class data.frame. Coercing to dgCMatrix.
## Warning: Data is of class data.frame. Coercing to dgCMatrix.
## Warning: Data is of class data.frame. Coercing to dgCMatrix.
## Warning: Data is of class data.frame. Coercing to dgCMatrix.
## Warning: Data is of class data.frame. Coercing to dgCMatrix.
## Warning: Data is of class data.frame. Coercing to dgCMatrix.
## Warning: Data is of class data.frame. Coercing to dgCMatrix.
## Warning: Data is of class data.frame. Coercing to dgCMatrix.
# Remove unneeded objects from the environment.
rm(file, seurat_data, seurat_obj)

Step 1.2: Merge Seurat objects

# Merge Seurat objects.
merged_seurat <- merge(x = GSM8215373_matrix_inflection_demulti_day00UT.txt,
                       y = c(GSM8215375_matrix_inflection_demulti_day03BLM.txt,
                             GSM8215377_matrix_inflection_demulti_day07BLM.txt,
                             GSM8215379_matrix_inflection_demulti_day14BLM.txt,
                             GSM8215381_matrix_inflection_demulti_day28BLM.txt,
                             GSM8215383_matrix_inflection_demulti_day42BLM.txt,
                             GSM8215385_matrix_inflection_demulti_day63BLM.txt),
                       add.cell.id = c("Lung_Control_Day0",
                                       "Lung_Bleomycin_Day3",
                                       "Lung_Bleomycin_Day7",
                                       "Lung_Bleomycin_Day14",
                                       "Lung_Bleomycin_Day28",
                                       "Lung_Bleomycin_Day42",
                                       "Lung_Bleomycin_Day63"))

# Concatenate the count matrices of the samples together.
merged_seurat <- JoinLayers(merged_seurat)

# Remove individual Seurat objects after successful merge.
rm(list = ls(pattern = "GSM"))
gc()
##             used   (Mb) gc trigger    (Mb) limit (Mb)   max used    (Mb)
## Ncells  15874709  847.9   25441184  1358.8         NA   25441184  1358.8
## Vcells 868017648 6622.5 4283986572 32684.3    1024000 5347863597 40801.0

Step 1.3: Read metadata

# Pull GEO metadata.
GSE264278_meta <- getGEO(GEO = "GSE264278", GSEMatrix = TRUE)
## Found 1 file(s)
## GSE264278_series_matrix.txt.gz
GSE264278_meta <- pData(GSE264278_meta$GSE264278_series_matrix.txt.gz)

# Let's filter out the SampleTags because they are not informative for our analysis.
GSE264278_meta <- GSE264278_meta[GSE264278_meta$molecule_ch1 == "polyA RNA",]

# Subset metadata.
GSE264278_meta <- GSE264278_meta %>% subset(select = c(geo_accession,
                                                       title,
                                                       organism_ch1,
                                                       `strain:ch1`,
                                                       `treatment:ch1`,
                                                       molecule_ch1))

# Rename columns to more accessible names.
colnames(GSE264278_meta) <- c("geo_accession",
                              "sample_name",
                              "organism",
                              "strain",
                              "treatment",
                              "rna_type")

# Add tissue_type column.
GSE264278_meta$tissue_type <- "whole lung"

# Add timepoint column and edit.
GSE264278_meta$timepoint <- GSE264278_meta$sample_name
GSE264278_meta$timepoint <- gsub("_cDNA|BLM|LPS", "", GSE264278_meta$timepoint)
GSE264278_meta$timepoint <- gsub("day", "Day ", GSE264278_meta$timepoint)
GSE264278_meta$timepoint <- gsub("Day 7Day 14", "Day 7/Day 14", GSE264278_meta$timepoint)

# Edit Seurat metadata.
# Add geo_accession column:
merged_seurat$orig.ident <- sub("not.detected", "not_detected", merged_seurat$orig.ident)
merged_seurat$geo_accession <- sub("^(.*)[.].*", "\\1", merged_seurat$orig.ident)

# Add replicate column:
merged_seurat$replicate <- sub(".*\\.", "", merged_seurat$orig.ident)

# Add and edit treatment_dosage column:
merged_seurat$treatment_dosage <- NA
merged_seurat$treatment_dosage[merged_seurat$geo_accession %in% c("day03BLM3mg",
                                                                  "day07BLM3mg",
                                                                  "day14BLM3mg")] <- "3mg"
merged_seurat$treatment_dosage[merged_seurat$geo_accession %in% c("day03BLM1.25mg",
                                                                  "day07BLM1.25mg",
                                                                  "day14BLM1.25mg",
                                                                  "day28BLM1.25mg",
                                                                  "day42BLM1.25mg",
                                                                  "day63BLM1.25mg")] <- "1.25mg"
merged_seurat$treatment_dosage[merged_seurat$geo_accession == "day00UT"] <- "Untreated"
merged_seurat$treatment_dosage[merged_seurat$geo_accession == "doublet"] <- "Doublet"
merged_seurat$treatment_dosage[merged_seurat$geo_accession == "not_detected"] <- "Not detected"

# Edit geo_accession column:
merged_seurat$geo_accession <- gsub("1.25mg|3mg", "", merged_seurat$geo_accession)
merged_seurat$geo_accession[merged_seurat$geo_accession == "day00UT"] <- "GSM8215373"
merged_seurat$geo_accession[merged_seurat$geo_accession == "day03BLM"] <- "GSM8215375"
merged_seurat$geo_accession[merged_seurat$geo_accession == "day07BLM"] <- "GSM8215377"
merged_seurat$geo_accession[merged_seurat$geo_accession == "day14BLM"] <- "GSM8215379"
merged_seurat$geo_accession[merged_seurat$geo_accession == "day28BLM"] <- "GSM8215381"
merged_seurat$geo_accession[merged_seurat$geo_accession == "day42BLM"] <- "GSM8215383"
merged_seurat$geo_accession[merged_seurat$geo_accession == "day63BLM"] <- "GSM8215385"

# Add quality metrics to the metadata. 
# Count total percentage of mitochondrial genes.
merged_seurat <- PercentageFeatureSet(merged_seurat, 
                                      pattern = "^mt-", 
                                      col.name = "percent.mito")

# Count total percentage of ribosomal genes.
merged_seurat <- PercentageFeatureSet(merged_seurat, 
                                      pattern = "^rp[sl]",
                                      col.name = "percent.ribo")

# Count total haemoglobin genes (but not HBP).
merged_seurat <- PercentageFeatureSet(merged_seurat,
                                      pattern = "^hb[^(p)]",
                                      col.name = "percent.globin")

# In the metadata, there are rows that are labelled as "Doublet" and "Not detected". Let's see what those are.
dim(merged_seurat) # 151,057 cells before filtering.
## [1]  36133 151057
head(merged_seurat@meta.data[merged_seurat$geo_accession %in% c("doublet", "not_detected"),])
##                                      orig.ident nCount_RNA nFeature_RNA
## Lung_Control_Day0_doublet_UTWTA_115     doublet      62576         3190
## Lung_Control_Day0_doublet_UTWTA_201     doublet     164511         5296
## Lung_Control_Day0_doublet_UTWTA_415     doublet     126449         4813
## Lung_Control_Day0_doublet_UTWTA_587     doublet     103500         4326
## Lung_Control_Day0_doublet_UTWTA_1503    doublet      75195         4018
## Lung_Control_Day0_doublet_UTWTA_1587    doublet     137594         5053
##                                      geo_accession replicate treatment_dosage
## Lung_Control_Day0_doublet_UTWTA_115        doublet   doublet          Doublet
## Lung_Control_Day0_doublet_UTWTA_201        doublet   doublet          Doublet
## Lung_Control_Day0_doublet_UTWTA_415        doublet   doublet          Doublet
## Lung_Control_Day0_doublet_UTWTA_587        doublet   doublet          Doublet
## Lung_Control_Day0_doublet_UTWTA_1503       doublet   doublet          Doublet
## Lung_Control_Day0_doublet_UTWTA_1587       doublet   doublet          Doublet
##                                      percent.mito percent.ribo percent.globin
## Lung_Control_Day0_doublet_UTWTA_115      5.361480            0              0
## Lung_Control_Day0_doublet_UTWTA_201      3.937731            0              0
## Lung_Control_Day0_doublet_UTWTA_415      4.124983            0              0
## Lung_Control_Day0_doublet_UTWTA_587      6.238647            0              0
## Lung_Control_Day0_doublet_UTWTA_1503     3.702374            0              0
## Lung_Control_Day0_doublet_UTWTA_1587     4.043781            0              0
dim(merged_seurat@meta.data[merged_seurat$geo_accession %in% c("doublet", "not_detected"),]) # 14,538 cells either labelled doublet or not detected, so they are removed.
## [1] 14538     9
merged_seurat <- subset(merged_seurat, 
                        subset = geo_accession %in% c("doublet", "not_detected"),
                        invert = TRUE)
dim(merged_seurat) # 136,519 cells after doublets and undetected cells are removed.
## [1]  36133 136519

Step 1.4: Add metadata variables from GEO

We can merged the metadata from GEO and put the merged metadata back into the Seurat object.

# Get metadata from Seurat object.
metadata = merged_seurat@meta.data %>% rownames_to_column()
dim(metadata) # 136,519 rows, 10 columns.
## [1] 136519     10
# Merge metadata with GEO metadata by the newly created geo_accession column.
metadata <- merge(metadata, GSE264278_meta, by = "geo_accession")
dim(metadata) # 136,519 rows, 17 columns. 
## [1] 136519     17
# Edit and retain only required columns.
metadata <- metadata %>% column_to_rownames() 

# Generate quality metrics.
names(metadata)[names(metadata) == "nCount_RNA"] <- "nUMI"
names(metadata)[names(metadata) == "nFeature_RNA"] <- "nGene"

# Compute mitochondrial ratio.
metadata$mitoRatio = PercentageFeatureSet(object = merged_seurat, pattern = "^mt-")
metadata$mitoRatio = metadata$mitoRatio / 100

# Calculate novelty score.
metadata$log10GenesPerUMI <- log10(metadata$nGene) / log10(metadata$nUMI)

# Add the new metadata back to the Seurat object. 
merged_seurat@meta.data <- metadata

# Remove unneeded objects.
rm(GSE264278_meta, metadata, sample_names)

Step 2: Quality Control

Step 2.1: Visualize cell counts

Check cell counts with number of unique barcodes detected to determine capture efficiency.

# Extract metadata from Seurat object.
metadata = merged_seurat@meta.data

# Visualize cell counts.
metadata %>%
  ggplot(aes(x = timepoint, fill = timepoint)) +
  geom_bar() +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  ggtitle("NCells")

Step 2.2: Visualize UMI and gene counts per cell

The UMI counts per cell should generally be above 500, that is the low end of what we expect. If UMI counts are between 500-1000 counts, it is usable but the cells probably should have been sequenced more deeply.

# Visualize UMI counts per cell.
metadata %>% 
  ggplot(aes(x = nUMI, color = timepoint, fill = timepoint)) +
  geom_density(alpha = 0.2) +
  scale_x_log10(labels = label_number()) +
  ylab("Cell density") +
  geom_vline(xintercept = 500, linetype = "dashed", color = "red") +
  theme_classic()

# Compare UMI counts and gene counts per cell. 
Idents(merged_seurat) <- "geo_accession"
VlnPlot(merged_seurat, features = c("nUMI", "nGene"), alpha = 0.2, raster = FALSE)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

Step 2.3: Visualize novelty score of cells

Generally, we expect the novelty score to be above 0.60 for good quality cells.

# Visualize novelty score.
metadata %>% 
  ggplot(aes(x = log10GenesPerUMI, color = timepoint, fill = timepoint)) +
  geom_density(alpha = 0.2) +
  geom_vline(xintercept = 0.6, linetype = "dashed", color = "red") +
  theme_classic()

Step 2.4: Visualize poor cell quality based on mitochondrial, ribosomal and haemoglobin ratio

In general, poor quality samples have mitochondrial and ribosomal ratio higher than 0.2 mark.

# Visualize mitochondrial ratio.
metadata %>%
  ggplot(aes(x = mitoRatio, color = timepoint, fill = timepoint)) +
  geom_density(alpha = 0.2) +
  scale_x_log10() +
  geom_vline(xintercept = 0.2, linetype = "dashed", color = "red") +
  theme_classic()

We can also use Seurat’s native function to visualize mito, ribo and haemoglobin genes in the dataset.

# Visualize all three metrics.
VlnPlot(merged_seurat, features = c("percent.mito", "percent.ribo", "percent.globin"), raster = FALSE)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Warning in SingleExIPlot(type = type, data = data[, x, drop = FALSE], idents =
## idents, : All cells have the same value of percent.ribo.
## Warning in SingleExIPlot(type = type, data = data[, x, drop = FALSE], idents =
## idents, : All cells have the same value of percent.globin.

Step 2.5: Visualize joint filtering effects

Considering any of these QC metrics in isolation can lead to misinterpretation of cellular signals. Jointly visualizing the count and gene thresholds and additionally overlaying the mitochondrial fraction, gives a summarized persepective of the quality per cell.

# Visualize joint filtering QCs.
metadata %>% 
    ggplot(aes(x = nUMI, y = nGene, color = mitoRatio)) + 
    geom_point() + 
    scale_colour_gradient(low = "gray90", high = "black") +
    stat_smooth(method = lm) +
    scale_x_log10() + 
    scale_y_log10() + 
    geom_vline(xintercept = 500) +
    geom_hline(yintercept = 250) +
    theme_classic() +
    facet_wrap(~timepoint)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Step 2.6: Visualize valid cells with knee plot

# Rank expression barcodes. 
br.out = barcodeRanks(merged_seurat@assays$RNA$counts)

# Plot knee plot.
plot(br.out$rank, br.out$total, log = "xy", xlab = "Rank", ylab = "Total")
abline(h = metadata(br.out)$knee, col = "dodgerblue", lty = 2)
abline(h = metadata(br.out)$inflection, col = "forestgreen", lty = 2)
legend("bottomleft", lty = 2, col = c("dodgerblue", "forestgreen"), legend = c("knee", "inflection"))

# Remove unneeded object.
rm(br.out)

Step 2.7: Filtering

Step 2.7.1: Cell-level filtering

Now that we have visualized the various metrics, we can decide on the thresholds to apply which will result in the removal of low quality cells. We will use the following thresholds:

  • nUMI > 500

  • nGene > 250

  • log10GenesPerUMI > 0.6

  • mitoRatio < 0.2

# Get pre-filtering cell counts.
dim(merged_seurat)[2] # 136,519 cells
## [1] 136519
# Low-quality cell filtering.
umi_filtered <- subset(x = merged_seurat, subset = nUMI >= 500)
gene_filtered <- subset(x = umi_filtered, subset = nGene >= 250)
genesperumi_filtered <- subset(x = gene_filtered, subset = log10GenesPerUMI > 0.6)
mitoratio_filtered <- subset(x = genesperumi_filtered, subset = mitoRatio < 0.20)

# Get post-filtering Seurat object. 
# Enter level of filtering desired here.
cell_filtered_seurat = mitoratio_filtered

# Get counts after low-quality cell filtering.
dim(cell_filtered_seurat)[2] # 133,367 cells.
## [1] 133367

We can visualize the amount of cells removed and the number of cells remaining for each filtering step in the plot below.

Step 2.7.2: Gene-level filtering

We also remove genes that have zero expression in all cells and genes that are only expressed in 10 or less cells as genes that are expressed in only a handful of cells are not meaningful and can bring down the averages for all the other cells that they are not expressed in.

# Extract count matrix.
counts <- GetAssayData(object = cell_filtered_seurat, layer = "counts")

# Get a logical vector for every gene that has more than zero counts per cell.
nonzero <- counts > 0

# Get a logical vector of genes that are expressed in 10 or more cells.
keep_genes <- Matrix::rowSums(nonzero) >= 10

# Subset counts to keep genes that are non-zero and expressed in 10 or more cells.
filtered_counts <- counts[keep_genes, ]

# Create new Seurat object with the subsetted counts.
filtered_seurat <- CreateSeuratObject(filtered_counts, meta.data = cell_filtered_seurat@meta.data)

We can visualize the amount of genes removed and the number of genes remaining for each filtering step in the plot below.

Step 3: Normalization

Step 3.1: Pre-normalization visualization

Let’s have a look of our expression distribution before normalization.

# Visualize expression distribution before normalization.
hist(colSums(filtered_seurat[["RNA"]]$counts), 
     breaks = 100, 
     col = "#212226",
     main = "Pre-Normalization: Expression distribution", 
     xlab = "Sum of expression")

Step 3.2: Simple normalization

Before we make any comparisons across cells, we will apply a simple normalization. This is solely for the purpose of exploring the sources of variation in our data.

# Apply simple normalization to merged Seurat objects.
seurat_phase <- NormalizeData(filtered_seurat)
## Normalizing layer: counts
# Visualize expression distribution after simple normalization.
hist(colSums(seurat_phase[["RNA"]]$data), 
     breaks = 100, 
     col = "#b8520a",
     main = "Post-Simple Normalization: Expression distribution", 
     xlab = "Sum of expression")

# Remove unneeded objects.
rm(filtered_seurat)

Step 3.3: Evaluate effects of cell cycle

Step 3.3.1: Set up cell cycle scores

# Download cell cycle genes for organism at https://github.com/hbc/tinyatlas/tree/master/cell_cycle. 
cc_file <- getURL("https://raw.githubusercontent.com/hbc/tinyatlas/master/cell_cycle/Mus_musculus.csv") 
cell_cycle_genes <- read.csv(text = cc_file)

# Remove unneeded objects.
rm(cc_file)

All of the cell cycle genes are Ensembl IDs, but our gene IDs are the gene names. To score the genes in our count matrix for cell cycle, we need to obtain the gene names for the cell cycle genes.

# Connect to AnnotationHub.
ah <- AnnotationHub()

# Access the Ensembl database for organism.
ahDb <- query(ah, 
              pattern = c("Mus musculus", "EnsDb"), 
              ignore.case = TRUE)

# Acquire the latest annotation files.
id <- ahDb %>%
        mcols() %>%
        rownames() %>%
        tail(n = 1)

# Download the appropriate Ensembldb database.
edb <- ah[[id]]
## loading from cache
# Extract gene-level information from database.
annotations <- genes(edb, 
                     return.type = "data.frame")

# Select annotations of interest.
annotations <- annotations %>%
        dplyr::select(gene_id, gene_name, seq_name, gene_biotype, description)

# Remove unneeded objects.
rm(ah, ahDb, edb, id)

We can merge the Ensembl genes to the cell cycle genes.

# Get gene names for Ensembl IDs for each gene.
cell_cycle_markers <- dplyr::left_join(cell_cycle_genes, annotations, by = c("geneID" = "gene_id"))

# Acquire the S phase genes.
s_genes <- cell_cycle_markers %>%
        dplyr::filter(phase == "S") %>%
        pull("gene_name")
        
# Acquire the G2M phase genes.        
g2m_genes <- cell_cycle_markers %>%
        dplyr::filter(phase == "G2/M") %>%
        pull("gene_name")

# Remove unneeded objects.
rm(cell_cycle_genes, cell_cycle_markers, annotations)

We can then use the annotated cell cycle genes to perform cell cycle scoring.

# Perform cell cycle scoring
seurat_phase <- CellCycleScoring(seurat_phase,
                                 g2m.features = g2m_genes,
                                 s.features = s_genes)

# Remove unneeded objects.
rm(g2m_genes, s_genes)

Step 3.3.2: Use PCA to evaluate effects of cell cycle

# Identify the most variable genes.
seurat_phase <- FindVariableFeatures(seurat_phase, 
                                     selection.method = "vst",
                                     nfeatures = 2000, 
                                     verbose = FALSE)
             
# Scale the counts.
seurat_phase <- ScaleData(seurat_phase)
## Centering and scaling data matrix
# Visualize expression distribution after scaling.
hist(colSums(seurat_phase[["RNA"]]$scale.data), 
     breaks = 100, 
     col = "#779e7f",
     main = "Post-Scaling: Expression distribution", 
     xlab = "Sum of expression")

# Identify the 15 most highly variable genes.
ranked_variable_genes <- VariableFeatures(seurat_phase)
top_genes <- ranked_variable_genes[1:15]

# Plot the average expression and variance of these genes with labels to indicate which genes are in the top 15.
p <- VariableFeaturePlot(seurat_phase)
LabelPoints(plot = p, points = top_genes, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results

# Remove unneeded objects.
rm(ranked_variable_genes, top_genes, p)
# Perform PCA.
seurat_phase <- RunPCA(seurat_phase)
## PC_ 1 
## Positive:  Tmem212, Dynlrb2, Ccdc153, Foxj1, Fam183b, Rsph1, Cyp2s1, Cdhr3, 1700016K19Rik, Cfap126 
##     1700007K13Rik, BC051019, Gm19935, Ak7, Nme5, Ccdc113, Nme9, Cdhr4, Cfap206, Sntn 
##     Rsph4a, Dnah5, Pifo, Tspan1, Riiad1, Ccdc39, Cfap53, Dnah6, Dnali1, Tctex1d4 
## Negative:  Ctss, Clec7a, Fn1, Ighm, Rgs1, Ifi27l2a, Gm12840, Ccr7, Apoe, Tgfbi 
##     Cd68, Ccl6, Acp5, Lpl, Igkc, Igfbp4, Wfdc17, Itgam, Clec4n, Il1b 
##     Fgl2, Mrc1, Ccr2, Mmp19, Msr1, Ccl9, Spp1, Osm, Cfh, Abcg1 
## PC_ 2 
## Positive:  Ctss, Clec7a, Ccl6, Cd68, Rgs1, Napsa, Ighm, Abcg1, Lyz1, Acp5 
##     Clec4n, Ccr7, Cd24a, Wfdc17, Lgals3, Il1b, Itgam, Igkc, Ear2, Mrc1 
##     Chil3, Osm, Tgfbi, Ccl9, F7, Il1rn, Atp6v0d2, Ccr2, Msr1, Fabp5 
## Negative:  Col1a2, Bgn, Col3a1, Serping1, Col1a1, Mgp, C1s1, Col6a2, Fstl1, Mmp2 
##     Rarres2, Fbn1, Ogn, Col5a2, Nbl1, Dpt, Igfbp7, Cfh, Gpc3, C1ra 
##     Mfap4, Mfap5, Fbln1, Eln, Gpx3, Adh1, Adamts2, Col6a3, Lox, Cp 
## PC_ 3 
## Positive:  Aqp5, Sftpd, Cldn18, Ager, Irx1, Lamp3, Slc34a2, Cxcl15, Sftpb, Sfta2 
##     Sftpa1, Ptprf, Muc1, Lgi3, Egfl6, Chil1, Hc, Slc39a8, Sftpc, S100g 
##     Mal2, Wfdc2, Kcnj15, Gprc5a, Slco4c1, Clic3, Lrp2, Sdc1, Ppp1r14c, Cystm1 
## Negative:  Ctss, Clec7a, Ccl6, Cd68, Clec4n, Mrc1, Tgfbi, Fn1, Wfdc17, Ear2 
##     Ctsk, Abcg1, Mmp19, Spp1, Acp5, Chil3, Lpl, Msr1, Osm, F7 
##     Trem2, Ccl9, Atp6v0d2, Ccdc153, Cdhr3, Tmem212, Il18, Il1b, Fam183b, Rgs1 
## PC_ 4 
## Positive:  Ccl6, Cd68, Lyz1, Clec7a, Lgals3, Clec4n, Mrc1, F7, Tgfbi, Atp6v0d2 
##     Ctsd, Ctsb, Chil3, Ear2, Trem2, Ctss, Fabp5, Msr1, Il18, Ltc4s 
##     Spp1, Mmp19, Ms4a8a, Plet1, Olr1, Wfdc17, Osm, Pld3, Wfdc21, Ctsk 
## Negative:  Ighm, Igkc, Ccr7, Kdr, Lyve1, Clic5, Iglc2, Iglc3, Pcp4l1, Tmem252 
##     Apold1, Pdzd2, Nkg7, Ackr3, Icos, Mzb1, Gm50318, Vwf, Cxcl12, Iigp1 
##     Ccl5, Gucy1b1, Postn, Prf1, Clca3a1, Cyp26b1, Pde5a, Gucy1a1, Edn1, Enpp2 
## PC_ 5 
## Positive:  Chil1, Slc34a2, Lamp3, Cxcl15, Hc, Sftpa1, Egfl6, S100g, Sftpb, Sfta2 
##     Slco4c1, Ppp1r14c, Il33, Acoxl, Lcn2, Sftpc, Lgi3, Etv5, Lrp2, Sftpd 
##     Muc1, Car8, Scd1, Kcnj15, Aox3, Fasn, Dram1, Napsa, Cbr2, Hp 
## Negative:  Col4a3, Rtkn2, Akap5, Col4a4, Spock2, Lama3, Sema3e, Scnn1g, Ano1, Krt7 
##     Itgb6, Lmo7, Fam174b, Ndnf, Cryab, Hopx, Mal2, Scnn1b, Lamc2, Cyp2b10 
##     Gprc5a, Clic5, Pdpn, Myh14, Sec14l3, Akr1c14, Clic3, Krt19, Flrt3, Ager
# Plot the PCA colored by cell cycle phase.
DimPlot(seurat_phase,
        reduction = "pca",
        group.by = "Phase",
        split.by = "Phase",
        raster = FALSE)

# Split effects of PCA by agedness
DimPlot(seurat_phase,
        reduction = "pca",
        group.by = "Phase",
        split.by = "treatment",
        raster = FALSE)

We don’t see large differences due to the effects of cell cycle, so cell cycle effects are not regressed out.

Step 3.4: SCTransform normalization

# Free up memory.
gc()
##              used    (Mb) gc trigger    (Mb) limit (Mb)   max used    (Mb)
## Ncells   16404146   876.1   25441184  1358.8         NA   25441184  1358.8
## Vcells 1769324545 13498.9 4234867266 32309.5    1024000 5347863597 40801.0
# Set memory limit.
options(future.globals.maxSize = 5000 * 1024^4)

# Perform SCTransform normalization.  
seurat_sct <- SCTransform(seurat_phase, 
                          vars.to.regress = c("mitoRatio"),
                          vst.flavor = "v2",
                          conserve.memory = TRUE,
                          verbose = FALSE)
## Will not return corrected UMI because residual type is not set to 'pearson'
# Compare the objects before and after SCT transformation. 
seurat_phase # Look at the feature counts before transformation.
## An object of class Seurat 
## 34407 features across 133367 samples within 1 assay 
## Active assay: RNA (34407 features, 2000 variable features)
##  3 layers present: counts, data, scale.data
##  1 dimensional reduction calculated: pca
seurat_sct # Look at the feature counts after transformation.
## An object of class Seurat 
## 68814 features across 133367 samples within 2 assays 
## Active assay: SCT (34407 features, 3000 variable features)
##  3 layers present: counts, data, scale.data
##  1 other assay present: RNA
##  1 dimensional reduction calculated: pca
# Check which objects are stored in which assays. 
seurat_sct@assays
## $RNA
## Assay (v5) data with 34407 features for 133367 cells
## Top 10 variable features:
##  Hbb-bs, Hba-a1, Hba-a2, Hbb-bt, Scgb3a1, Scgb3a2, Retnla, Jchain,
## Bpifa1, Bpifb1 
## Layers:
##  counts, data, scale.data 
## 
## $SCT
## SCTAssay data with 34407 features for 133367 cells, and 1 SCTModel(s) 
## Top 10 variable features:
##  Sftpc, S100a9, S100a8, Ccl21a, Scgb1a1, Gzma, Ccl21b, Sftpa1, Retnlg,
## Acta2
# Remove unneeded objects.
rm(seurat_phase)

The SCTransform normalized counts are stored in the SCT$counts slot. Here is the distribution of the SCTransform normalized counts distribution.

# Visualize expression distribution after scaling.
hist(colSums(seurat_sct[["SCT"]]$counts), 
     breaks = 100, 
     col = "#332f42",
     main = "Post-SCTransform: Expression distribution", 
     xlab = "Sum of expression")

The log-normalized SCTransfromed counts are also stored in the SCT$data slot. Here is the distribution of the log-normalized SCTransformed counts.

hist(colSums(seurat_sct[["SCT"]]$data), 
     breaks = 100, 
     col = "#b14b34",
     main = "Post-SCTransformed: Log-normalized expression distribution", 
     xlab = "Sum of expression")

Step 4: Dimensional Reduction and Clustering

The goal of this step is to perform unsupervised clustering to identify groups of cells based on similarities of their transcriptomes without any prior knowledge of the labels or the number of clusters. This can be challenging due to the high level of noise and large number of dimensions, so it is often beneficial to apply a dimensionality reduction method to reduce the amount of noise.

Step 4.1: Run PCA

# Run PCA.
seurat_sct <- RunPCA(seurat_sct, verbose = FALSE)

# Visualize dimension reduction from PCA.
seurat_sct <- RunUMAP(seurat_sct, dims = 1:40, reduction = "pca")
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 10:33:56 UMAP embedding parameters a = 0.9922 b = 1.112
## 10:33:56 Read 133367 rows and found 40 numeric columns
## 10:33:56 Using Annoy for neighbor search, n_neighbors = 30
## 10:33:56 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:34:13 Writing NN index file to temp file /var/folders/1c/sbnd3yvd7mzg5lng1jcr70tw0000gn/T//RtmpclL6CG/file11cb24e1ba059
## 10:34:13 Searching Annoy index using 1 thread, search_k = 3000
## 10:35:07 Annoy recall = 100%
## 10:35:09 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 10:35:13 Initializing from normalized Laplacian + noise (using RSpectra)
## 10:35:29 Commencing optimization for 200 epochs, with 5933352 positive edges
## 10:36:26 Optimization finished
DimPlot(object = seurat_sct, group.by = "timepoint", raster = FALSE)

Step 4.2: Top PC-driving metagenes

# Explore the top PC markers.
DimHeatmap(seurat_sct, 
           dims = 1:9, 
           cells = 500, 
           balanced = TRUE)

We can visualize an elbow plot to see how many PCs to use for clustering.

# Build the elbow plot.
# Determine percent of variation associated with each PC.
pct <- seurat_sct[["pca"]]@stdev / sum(seurat_sct[["pca"]]@stdev) * 100

# Calculate cumulative percents for each PC.
cumu <- cumsum(pct)

# Determine which PC exhibits cumulative percent greater than 90% and % variation associated with the PC as less than 5.
co1 <- which(cumu > 90 & pct < 5)[1]
co1
## [1] 40
# Determine the difference between variation of PC and subsequent PC.
co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1

# Last point where change of % of variation is more than 0.1%.
co2
## [1] 21
# Minimum of the two calculation.
pcs <- min(co1, co2)
pcs
## [1] 21
# Create a dataframe with values.
elbowplot_df <- data.frame(pct = pct, 
                           cumu = cumu, 
                           rank = 1:length(pct))

# Elbow plot to visualize.
ggplot(elbowplot_df, aes(cumu, pct, label = rank, color = rank > pcs)) + 
  geom_text() + 
  geom_vline(xintercept = 90, color = "grey") + 
  geom_hline(yintercept = min(pct[pct > 5]), color = "grey") +
  theme_bw()

# Remove unneeded objects.
rm(co1, co2, cumu, elbowplot_df, pcs, pct)

Based on the elbow plot, we will use approx. 20 PCs to generate the clusters.

Step 4.3: Cluster the cells

The basic steps towards complete clustering involve:

  • Measure of similarity: How does one quantify how close two data points are?

  • Quality function: How does one define the clustering/partition of the points?

  • Algorithm: How to find the clustering whereby the quality function is optimized?

Step 4.3.1: Find neighbours

We use a graph-based community detection method where each vertex represents a cell and the weight of the edge measures similarities between two cells. Put simply, we start by building an unweighted K-nearest neighbour (k-NN) graph and add weights to obtain a shared nearest neighbour (SNN) graph. Weight can be added either by finding out the shared nodes (overlap) between two neighbours (more nodes; more similar the neighbours; more weight) or measuring the closeness of both neighbours to each other.

# Compute SNN graph.
seurat_cluster <- FindNeighbors(object = seurat_sct, dims = 1:21) # Based on elbow plot.
## Computing nearest neighbor graph
## Computing SNN
# Remove unneeded object.
rm(seurat_sct)

Step 4.3.2: Find clusters

Next, we want to iteratively group the neighbours together with the goal of combining the cells with similar transcriptomes together. Without knowing the number of clusters a priori, we use a range of resolution based on the approximation of the recommendation where datasets of 3,000 - 5,000 cells should set the resolution between 0.4 - 1.4. We have approximately 100,000 cells, so our range will start from 0.2 - 1.0.

# Determine start and end resolution range.
start_reso_range = 0.2
end_reso_range = 1
range_interval = 0.2

# Find clusters given the range.
seurat_cluster <- FindClusters(object = seurat_cluster,
                               resolution = seq(from = start_reso_range,
                                                to = end_reso_range,
                                                by = range_interval))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 133367
## Number of edges: 4582477
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9738
## Number of communities: 17
## Elapsed time: 60 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 133367
## Number of edges: 4582477
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9581
## Number of communities: 25
## Elapsed time: 61 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 133367
## Number of edges: 4582477
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9466
## Number of communities: 31
## Elapsed time: 61 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 133367
## Number of edges: 4582477
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9368
## Number of communities: 34
## Elapsed time: 60 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 133367
## Number of edges: 4582477
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9277
## Number of communities: 40
## Elapsed time: 58 seconds
# Remove unneeded objects.
rm(start_reso_range, end_reso_range, range_interval)

We can look at the maximum number of clusters found at each resolution with the output below.

# Create a dummy table to contain the max cluster for each resolution.
maxclust = c()
clustoutput = seurat_cluster@meta.data %>% subset(select = grep("SCT_snn_", colnames(.)))
clustoutput[] <- lapply(clustoutput, function(x) as.numeric(as.character(x)))

# Iterate and print the maximum cluster count for each resolution.
for (i in 1:ncol(clustoutput)){
  maxclust[i] = max(clustoutput[, i])
  print(paste0("Max n-cluster for ", 
               names(clustoutput[i]), ": ", 
               maxclust[i]))
}
## [1] "Max n-cluster for SCT_snn_res.0.2: 16"
## [1] "Max n-cluster for SCT_snn_res.0.4: 24"
## [1] "Max n-cluster for SCT_snn_res.0.6: 30"
## [1] "Max n-cluster for SCT_snn_res.0.8: 33"
## [1] "Max n-cluster for SCT_snn_res.1: 39"
# Remove unneeded objects.
rm(clustoutput, i, maxclust)

We can visualize the difference clusterings of the resolutions side by side. In this case, we will look at these resolutions: 0.2, 0.4, 0.6.

# Select resolutions to visualize.
resolutions = c("0.2", "0.4", "0.6")
feature_cols = paste0("SCT_snn_res.", resolutions)

# Visualize UMAP of different resolutions.
for (reso in feature_cols) {
  Idents(seurat_cluster) <- reso
  p <- DimPlot(seurat_cluster,
               reduction = "umap",
               label = TRUE,
               label.size = 3,
               raster = FALSE) + 
    ggtitle(reso) +
    NoLegend()
  print(p)
}

# Remove unneeded objects.
rm(feature_cols, p, reso, resolutions)

Step 4.4: Explore clustering output

Let’s start with the exploration with the resolution: 0.6.

# Sort the cluster labels in ascending order.
reso_columns = grep("SCT_snn_", colnames(seurat_cluster@meta.data), value = TRUE)
metadata <- seurat_cluster@meta.data
for (col in colnames(metadata)) {
  if (col %in% reso_columns){
    metadata[, col] <- factor(metadata[, col], levels = paste(sort(as.integer(levels(metadata[, col])))))
  }
}
seurat_cluster@meta.data <- metadata

# Assign desired cluster resolution as identity of Seurat.
Idents(object = seurat_cluster) <- "SCT_snn_res.0.6"

# Visualize cluster with a UMAP.
DimPlot(seurat_cluster,
        reduction = "umap",
        label = TRUE,
        label.size = 3,
        raster = FALSE) + 
  ggtitle("SCT_snn_res.0.6") +
  NoLegend()

# Remove unneeded objects.
rm(col, metadata, reso_columns)

Step 4.5: Explore clustering quality control

Step 4.5.1: Distribution of cells by treatment

# Extract number of cells per cluster.
n_cells <- FetchData(seurat_cluster,
                     vars = c("ident", "treatment")) %>%
  dplyr::count(ident, treatment) 

# Visualize distribution of cells by cluster in a barplot.
ggplot(n_cells, aes(x = ident, y = n, fill = treatment)) +
  geom_bar(position = position_dodge(), 
           stat = "identity") +
  geom_text(aes(label = n), 
            size = 2, 
            vjust = -0.2, 
            position = position_dodge(1)) +
  theme_classic() + 
  theme(axis.text.x = element_text(size = 5))

# Visualize distribution of treatment in each cluster.
DimPlot(seurat_cluster, 
        label = TRUE, 
        split.by = "treatment",
        label.size = 3,
        raster = FALSE) + 
  theme(axis.text.x = element_text(size = 6),
        axis.text.y = element_text(size = 6)) +
  NoLegend()

# Remove unneeded object.
rm(n_cells)

Step 4.5.2: Mitochondrial enrichment in clusters

We would like to see if there is any enrichment of mitochondria in our clusters and if we need to regress out the mitochondrial level in our data.

# Plot mitoRatio to see if any cluster is enriched in mitochondrial level.
FeaturePlot(seurat_cluster, 
            reduction = "umap", 
            features = "mitoRatio",
            cols = c("#d6cfc7", "#01013f"),
            pt.size = 0.5, 
            order = TRUE,
            min.cutoff = "q10",
            label = TRUE,
            label.size = 3,
            label.color = "#960000",
            raster = FALSE) 

Step 4.5.3: Variation in clusters due to cell cycle phase

If our cell clusters show large differences in cell cycle expression, we might want to regress out the effects of cell cycle phase.

# Explore whether clusters segregate by cell cycle phase.
DimPlot(seurat_cluster,
        label = TRUE, 
        split.by = "Phase",
        label.size = 4,
        raster = FALSE) + 
  theme(axis.text.x = element_text(size = 6),
        axis.text.y = element_text(size = 6)) +
  NoLegend()

We don’t see any noticeable cell cycle phase effects in our clusters.

Step 5: Cell Typing

Step 5.1: Exploration of cluster identities

Step 5.1.1: Explore PC metagenes

We can explore if any specific PCs drive the separation of the clusters.

# Define columns in the metadata to visualize.
columns = c(paste0("PC_", 1:12), "ident", "umap_1", "umap_2")

# Extract data from Seurat object based on column names.
pc_data <- FetchData(seurat_cluster, vars = columns)

# Add cluster label to the centre of cluster on UMAP. 
umap_label = FetchData(seurat_cluster, 
                       vars = c("ident", "umap_1", "umap_2")) %>%
  group_by(ident) %>%
  summarise(x = mean(umap_1), y = mean(umap_2))

# Plot a UMAP plot for each of the PCs.
map(paste0("PC_", 1:12), function(pc){
  ggplot(pc_data, aes(umap_1, umap_2)) +
    geom_point(aes_string(color = pc), alpha = 0.7) +
    scale_color_gradient(guide = "none", low = "#d6cfc7", high = "#01013f") +
    geom_text(data = umap_label, aes(label = ident, x, y), color = "#960000", size = 4) +
    theme_classic() +
    ggtitle(pc)}) %>%
  plot_grid(plotlist = .)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Remove unneeded objects.
rm(columns, pc_data, umap_label)

We can print the genes driving different PCs and correlate those metagenes with the identity of the clusters.

# Print PC metagenes.
print(seurat_cluster[["pca"]], dims = 1:12, nfeatures = 5)
## PC_ 1 
## Positive:  Mgp, Inmt, Gsn, Bgn, Ogn 
## Negative:  Lyz2, Il1b, Cd74, S100a9, Cxcl2 
## PC_ 2 
## Positive:  Lyz2, Sftpc, Sftpb, Sftpa1, Mgp 
## Negative:  Ptprb, Calcrl, Cldn5, Epas1, Lyve1 
## PC_ 3 
## Positive:  Sftpc, Sftpb, Sftpa1, Cxcl15, Sftpd 
## Negative:  Igkc, Cd74, Ighm, Cd79a, Ccl5 
## PC_ 4 
## Positive:  Ccl5, Gzma, Igkc, Cd74, Prf1 
## Negative:  S100a9, S100a8, Il1b, Cxcl2, Acod1 
## PC_ 5 
## Positive:  Lyz2, Sftpc, Sftpa1, Sftpb, Chil3 
## Negative:  Sec14l3, Cbr2, Tmem212, Ccdc153, Cyp2f2 
## PC_ 6 
## Positive:  Lyz2, Chil3, Spp1, Ccl6, Ctss 
## Negative:  Ccl5, Gzma, S100a9, S100a8, Prf1 
## PC_ 7 
## Positive:  Igkc, Cd74, Ighm, Cd79a, H2-Aa 
## Negative:  Ccl5, Gzma, Prf1, Nkg7, Lyz2 
## PC_ 8 
## Positive:  Postn, Acta2, Myh11, Gucy1a1, Tagln 
## Negative:  Inmt, Gsn, Mgp, Mfap4, Gzma 
## PC_ 9 
## Positive:  Cbr2, Postn, Sftpa1, Sftpb, Cxcl15 
## Negative:  Ager, Cldn18, Akap5, Rtkn2, Sec14l3 
## PC_ 10 
## Positive:  Acta2, Tagln, Myh11, Myl9, Cnn1 
## Negative:  Postn, Gucy1a1, Pdzd2, Gucy1b1, Inmt 
## PC_ 11 
## Positive:  Ccl21a, Ccl21b, Mmrn1, Ccl21d, Gm10591 
## Negative:  Acta2, Tagln, Myh11, Cd74, Gzma 
## PC_ 12 
## Positive:  Dcn, Mgp, Col3a1, Col1a1, Col1a2 
## Negative:  Inmt, Npnt, Ogn, Fmo2, Pcolce2

Step 5.2: Explore cell identities with canonical markers

Step 5.2.1: Explore AT1 clusters

The SCTransformed data was performed only on the 3000 most variable genes. Ideally, we want to explore the identity of the clusters with all the genes in the dataset and not just the top 3000 most variable genes, so we need to set the default assay back to the RNA assay.

# Set default assay back to RNA.
DefaultAssay(seurat_cluster) <- "RNA"

We can explore canonical cell type markers of our cell types of interest and visualize their expression on our UMAP. Canonical markers are obtained from LungMAP and A molecular cell atlas of the human lung from single-cell RNA sequencing (2020) Nature

AT1 cells are defined by the cell markers: Hopx, Ager, Rtkn2, Pdpn and Clic5. We can visualize both the UMAP and Dotplot to see which cluster(s) do these markers localize.

# Visualize AT1 markers expression on dotplot.
DotPlot(seurat_cluster,
        features = c("Hopx", "Ager", "Rtkn2", "Pdpn", "Clic5"),
        cluster.idents = TRUE,
        cols = c("#d6cfc7", "#01013f"),
        dot.scale = 6,
        scale.by = "radius") +
  ggtitle("AT1 markers") +
  theme(plot.title = element_text(hjust = 0.5))

Step 5.2.2: Explore AT2 clusters

AT2 cells are defined by the cell markers: Sftpb, Sftpc, Sftpd, Muc1, Etv5 and Lamp3.

# Visualize AT2 markers expression on dotplot.
DotPlot(seurat_cluster,
        features = c("Sftpb", "Sftpc", "Sftpd", "Muc1", "Etv5", "Lamp3"),
        cluster.idents = TRUE,
        cols = c("#d6cfc7", "#01013f"),
        dot.scale = 6,
        scale.by = "radius") +
  ggtitle("AT2 markers") +
  theme(plot.title = element_text(hjust = 0.5))

Step 5.2.3: Explore fibroblast clusters

Alveolar fibroblast cells are defined by the cell markers: Pdgfra, Tcf21, Wnt2, Mfap5, Col1a1.

Pericytes are defined by the cell markers: Pdgfrb, Trpc6, Cspg4.

# Visualize alveolar fibroblast markers expression on dotplot.
DotPlot(seurat_cluster,
        features = c("Tcf21", "Wnt2", "Mfap5", "Pdgfra", "Pdgfrb", "Trpc6", "Col1a1"),
        cluster.idents = TRUE,
        cols = c("#d6cfc7", "#01013f"),
        dot.scale = 6,
        scale.by = "radius") +
  ggtitle("Fibroblast markers") +
  theme(plot.title = element_text(hjust = 0.5))

Step 5.2.4: Explore myofibroblast clusters

Myofibroblast cells are defined by the cell markers: Wt1, Fgf18, Dach2, Frem2, Eln, Acta2.

# Visualize myofibroblast markers expression on dotplot.
DotPlot(seurat_cluster,
        features = c("Wt1", "Fgf18", "Dach2", "Frem2", "Acta2"),
        cluster.idents = TRUE,
        cols = c("#d6cfc7", "#01013f"),
        dot.scale = 6,
        scale.by = "radius") +
  ggtitle("Myofibroblast markers") +
  theme(plot.title = element_text(hjust = 0.5))

Step 5.3: Cell type annotation

Step 5.3.1: Reference-based Azimuth annotation

Azimuth uses a reference-based mapping with the Human Biomolecular Atlas Project, containing references for multiple human tissues.

# Check all available Azimuth references.
available_data <- AvailableData()
available_data[grep("Azimuth", available_data[, 3]), 1:3] # Use lungref.
##                                  Dataset Version                        Summary
## adiposeref.SeuratData         adiposeref   1.0.0     Azimuth Reference: adipose
## bonemarrowref.SeuratData   bonemarrowref   1.0.0  Azimuth Reference: bonemarrow
## fetusref.SeuratData             fetusref   1.0.0       Azimuth Reference: fetus
## heartref.SeuratData             heartref   1.0.0       Azimuth Reference: heart
## humancortexref.SeuratData humancortexref   1.0.0 Azimuth Reference: humancortex
## kidneyref.SeuratData           kidneyref   1.0.2      Azimuth Reference: kidney
## lungref.SeuratData               lungref   2.0.0        Azimuth Reference: lung
## mousecortexref.SeuratData mousecortexref   1.0.0 Azimuth Reference: mousecortex
## pancreasref.SeuratData       pancreasref   1.0.0    Azimuth Reference: pancreas
## pbmcref.SeuratData               pbmcref   1.0.0        Azimuth Reference: pbmc
## tonsilref.SeuratData           tonsilref   2.0.0      Azimuth Reference: tonsil
# Run Azimuth with lungref.
options(timeout = 1000)
seurat_annotated <- RunAzimuth(seurat_cluster, reference = "lungref", assay = "RNA")
## Warning: Overwriting miscellanous data for model
## Warning: Adding a dimensional reduction (refUMAP) without the associated assay
## being present
## detected inputs from MOUSE with id type Gene.name
## reference rownames detected HUMAN with id type Gene.name
## Found 17502 out of 34407 total inputs in conversion table
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Normalizing query using reference SCT model
## Warning: No layers found matching search pattern provided
## Warning: 822 features of the features specified were not present in both the reference query assays. 
## Continuing with remaining 2178 features.
## Projecting cell embeddings
## Finding query neighbors
## Finding neighborhoods
## Finding anchors
##  Found 27534 anchors
## Finding integration vectors
## Finding integration vector weights
## Predicting cell labels
## Predicting cell labels
## Predicting cell labels
## Predicting cell labels
## Predicting cell labels
## Predicting cell labels
## 
## Integrating dataset 2 with reference dataset
## Finding integration vectors
## Integrating data
## Warning: Keys should be one or more alphanumeric characters followed by an
## underscore, setting key from integrated_dr_ to integrateddr_
## Computing nearest neighbors
## Running UMAP projection
## Warning in RunUMAP.default(object = neighborlist, reduction.model =
## reduction.model, : Number of neighbors between query and reference is not equal
## to the number of neighbors within reference
## 12:54:55 Read 133367 rows
## 12:54:55 Processing block 1 of 1
## 12:54:55 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 20
## 12:54:55 Initializing by weighted average of neighbor coordinates using 1 thread
## 12:54:56 Commencing optimization for 67 epochs, with 2667340 positive edges
## 12:55:09 Finished
## Warning: No assay specified, setting assay as RNA by default.
## Projecting reference PCA onto query
## Finding integration vector weights
## Projecting back the query cells into original PCA space
## Finding integration vector weights
## Computing scores:
##     Finding neighbors of original query cells
##     Finding neighbors of transformed query cells
##     Computing query SNN
##     Determining bandwidth and computing transition probabilities
## Total elapsed time: 1.6905783812205
# Remove unneeded object.
rm(available_data)
# Assign desired cluster resolution as identity of Seurat.
Idents(object = seurat_annotated) <- "predicted.ann_finest_level"

# Visualize annotated cluster with a UMAP.
options(ggrepel.max.overlaps = Inf) 
DimPlot(seurat_annotated,
        reduction = "umap",
        group.by = "predicted.ann_finest_level",
        label = TRUE,
        label.size = 4,
        repel = TRUE,
        raster = FALSE) + 
  ggtitle("Azimuth annotation") +
  theme(plot.title = element_text(hjust = 0.5)) +
  NoLegend()

# Visualize annotation prediction score.
FeaturePlot(seurat_annotated, 
            reduction = "umap", 
            features = "mapping.score", 
            cols = c("#01013f", "#59974d", "#fff000"),
            raster = FALSE) +
  theme(axis.text.x = element_text(size = 8),
        axis.text.y = element_text(size = 8)) +
  ggtitle("Annotation prediction score")

# Get metadata.
annotation_meta = seurat_annotated@meta.data

# Subset the required columns.
annotation_meta <- annotation_meta %>% subset(select = c(SCT_snn_res.0.6, predicted.ann_finest_level, mapping.score))

# Group the table by cluster and annotation.
annotation_meta <- annotation_meta %>% 
  dplyr::rename(cluster = "SCT_snn_res.0.6", 
                cluster_annotation = "predicted.ann_finest_level") %>%
  group_by(cluster, cluster_annotation) %>% 
  summarise(mean_mapping_score = mean(mapping.score))
## `summarise()` has grouped output by 'cluster'. You can override using the
## `.groups` argument.
# Visualize the probable annotations for each cluster.
annotation_meta <- annotation_meta %>% arrange(cluster, desc(mean_mapping_score)) %>% slice_head(n = 3)
annotation_meta
## # A tibble: 91 × 3
## # Groups:   cluster [31]
##    cluster cluster_annotation    mean_mapping_score
##    <fct>   <chr>                              <dbl>
##  1 0       T cells proliferating              1    
##  2 0       Plasma cells                       0.985
##  3 0       Plasmacytoid DCs                   0.867
##  4 1       EC venous pulmonary                0.792
##  5 1       Smooth muscle                      0.747
##  6 1       EC general capillary               0.667
##  7 2       Plasma cells                       0.992
##  8 2       B cells                            0.767
##  9 2       Lymphatic EC mature                0.289
## 10 3       Lymphatic EC mature                0.985
## # ℹ 81 more rows
# Remove unneeded object.
rm(annotation_meta)

Step 5.3.2: Marker-based ScType annotation

We can cross-check the reference-based annotation with the ScType annotation for validation.

# Load in ScType functions.
source("/Users/kendrix/Documents/Work/UnityHealth/Data_Projects/auto_detect_tissue_type_KK.R")
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R")
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R")

# Set path to ScType reference database.
db_ = "https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/ScTypeDB_full.xlsx"

# Check tissue type for validation.
tissue_guess <- auto_detect_tissue_type(path_to_db_file = db_, 
                                        seuratObject = seurat_cluster, 
                                        scaled = TRUE, 
                                        assay = "SCT") 
## [1] "Checking...Immune system"
## Selecting by scores
## [1] "Checking...Pancreas"
## Selecting by scores
## [1] "Checking...Liver"
## Selecting by scores
## [1] "Checking...Eye"
## Selecting by scores
## [1] "Checking...Kidney"
## Selecting by scores
## [1] "Checking...Brain"
## Selecting by scores
## [1] "Checking...Lung"
## Selecting by scores
## [1] "Checking...Adrenal"
## Selecting by scores
## [1] "Checking...Heart"
## Selecting by scores
## [1] "Checking...Intestine"
## Selecting by scores
## [1] "Checking...Muscle"
## Selecting by scores
## [1] "Checking...Placenta"
## Selecting by scores
## [1] "Checking...Spleen"
## Selecting by scores
## [1] "Checking...Stomach"
## Selecting by scores
## [1] "Checking...Thymus"
## Selecting by scores
## [1] "Checking...Hippocampus"
## Selecting by scores

# Define Seurat object.
seuobj = seurat_cluster

# Specify tissue of interest.
tissue = "Lung"

# Prepare gene sets. 
gs_list <- gene_sets_prepare(db_, tissue)
    
# Check Seurat object version - count matrix is extracted differently between versions.
seurat_package_v5 <- isFALSE("counts" %in% names(attributes(seuobj[["RNA"]])))
print(sprintf("Seurat object %s is used", ifelse(seurat_package_v5, "v5", "v4")))
## [1] "Seurat object v5 is used"
# Extract scaled scRNAseq matrix.
scRNAseqData_scaled <- if (seurat_package_v5) as.matrix(seuobj[["RNA"]]$scale.data) else 
  as.matrix(seuobj[["RNA"]]@scale.data)
    
# Run ScType.
es.max <- sctype_score(scRNAseqData = scRNAseqData_scaled, 
                       scaled = TRUE, 
                       gs = gs_list$gs_positive, 
                       gs2 = gs_list$gs_negative)
    
# Merge ScType results by clusters.
cL_results <- do.call("rbind", 
                      lapply(unique(seuobj@meta.data$seurat_clusters), 
                             function(cl){es.max.cl = sort(
                               rowSums(
                                 es.max[, rownames(
                                   seuobj@meta.data[
                                     seuobj@meta.data$seurat_clusters == cl, ])]),
                               decreasing = !0)
                             head(data.frame(cluster = cl,
                                             type = names(es.max.cl),
                                             scores = es.max.cl,
                                             ncells = sum(seuobj@meta.data$seurat_clusters == cl)), 10)
                             }))
sctype_scores <- cL_results %>% group_by(cluster) %>% top_n(n = 1, wt = scores)
    
# Set low-confident (low ScType score) clusters to "Unknown".
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] <- "Unknown"
print(sctype_scores[, 1:3])
## # A tibble: 40 × 3
## # Groups:   cluster [40]
##    cluster type                             scores
##    <fct>   <chr>                             <dbl>
##  1 3       Unknown                           1810.
##  2 33      Endothelial cell                   307.
##  3 22      Endothelial cell                  7807.
##  4 0       Unknown                           1455.
##  5 20      Endothelial cell                  6153.
##  6 5       Fibroblasts                      21080.
##  7 4       Alveolar macrophages             52124.
##  8 1       Endothelial cell                 20266.
##  9 12      Pulmonary alveolar type II cells 61301.
## 10 17      Fibroblasts                      11350.
## # ℹ 30 more rows
# Add annotated cell types to the metadata.
seurat_cluster@meta.data$sctype_annotation = ""

for(j in unique(sctype_scores$cluster)){
  cl_type = sctype_scores[sctype_scores$cluster==j, ]
  seurat_cluster@meta.data$sctype_annotation[seurat_cluster@meta.data$seurat_clusters == j] = as.character(cl_type$type[1])
}
# Plot UMAP with ScType annotation.
DimPlot(seurat_cluster, 
        reduction = "umap", 
        label = TRUE, 
        label.size = 3,
        repel = TRUE, 
        group.by = "sctype_annotation",
        raster = FALSE) + 
  NoLegend()

# Remove unneeded objects.
rm(auto_detect_tissue_type, cL_results, cl_type, db_, es.max,
   gene_sets_prepare, gs_list, j, scRNAseqData_scaled, sctype_score,
   sctype_scores, seuobj, seurat_package_v5, tissue, tissue_guess)

Step 5.4: Cross-check and collapse annotated cell types

Step 5.4.1: Add the Azimuth annotation to original seurat object

# Add metadata from seurat_annotated, which contains the Azimuth annotation and ScType annotation to seurat_cluster.
seurat_cluster@meta.data <- seurat_annotated@meta.data

# Get Ensembl annotation data.
# Connect to AnnotationHub.
ah <- AnnotationHub()

# Access the Ensembl database for organism.
ahDb <- query(ah, 
              pattern = c("Mus musculus", "EnsDb"), 
              ignore.case = TRUE)

# Acquire the latest annotation files.
id <- ahDb %>%
  mcols() %>%
  rownames() %>%
  tail(n = 1)

# Download the appropriate Ensembldb database.
edb <- ah[[id]]
## loading from cache
# Extract gene-level information from database.
annotations <- genes(edb, return.type = "data.frame")

# Select annotations of interest.
annotations <- annotations %>%
  dplyr::select(gene_id, gene_name, seq_name, gene_biotype, description)

# Remove unneeded objects.
rm(ah, ahDb, edb, id)

Step 5.4.2: Remove cells that are identified as nasal or mesothelium

# Check unique cell type labels.
unique(seurat_cluster$predicted.ann_finest_level)
##  [1] "B cells"                      "EC venous systemic"          
##  [3] "EC venous pulmonary"          "EC aerocyte capillary"       
##  [5] "Alveolar fibroblasts"         "Non-classical monocytes"     
##  [7] "EC arterial"                  "AT2"                         
##  [9] "Adventitial fibroblasts"      "Smooth muscle"               
## [11] "CD8 T cells"                  "Interstitial Mφ perivascular"
## [13] "CD4 T cells"                  "Classical monocytes"         
## [15] "DC2"                          "Lymphatic EC differentiating"
## [17] "Monocyte-derived Mφ"          "AT1"                         
## [19] "Club (non-nasal)"             "EC general capillary"        
## [21] "Pericytes"                    "Multiciliated (non-nasal)"   
## [23] "Transitional Club-AT2"        "SM activated stress response"
## [25] "Myofibroblasts"               "NK cells"                    
## [27] "Peribronchial fibroblasts"    "Mast cells"                  
## [29] "Club (nasal)"                 "Fibromyocytes"               
## [31] "Plasmacytoid DCs"             "T cells proliferating"       
## [33] "Lymphatic EC mature"          "Ionocyte"                    
## [35] "Basal resting"                "Suprabasal"                  
## [37] "AT2 proliferating"            "Migratory DCs"               
## [39] "Plasma cells"                 "Alveolar Mφ proliferating"   
## [41] "Lymphatic EC proliferating"   "Neuroendocrine"              
## [43] "Deuterosomal"                 "Goblet (nasal)"              
## [45] "SMG duct"                     "Goblet (bronchial)"          
## [47] "Alveolar macrophages"         "SMG serous (nasal)"
# Remove any cells that are annotated as nasal.
grep("nasal", seurat_cluster$predicted.ann_finest_level, value = TRUE) %>% unique()
## [1] "Club (non-nasal)"          "Multiciliated (non-nasal)"
## [3] "Club (nasal)"              "Goblet (nasal)"           
## [5] "SMG serous (nasal)"
# Club (nasal), Goblet (nasal)
dim(seurat_cluster)[2] # 133,367
## [1] 133367
seurat_cluster <- subset(seurat_cluster, 
                         subset = predicted.ann_finest_level %in% c("Club (nasal)",
                                                                    "Goblet (nasal)",
                                                                    "SMG serous (nasal)"),
                         invert = TRUE)
dim(seurat_cluster)[2] # 133,251 (116 cells removed)
## [1] 133251
# Remove any cells that are labelled mesothelial cells as they are pleural and are not specialized lung cells.  
grep("Mesothelium", seurat_cluster$predicted.ann_finest_level, value = TRUE) %>% unique() 
## character(0)
# None

Step 5.4.3: Collapse cell type annotation

# Pull cell metadata from the Seurat object.
metadata <- seurat_cluster@meta.data

# Aggregate some annotated cell types.
# Find all fibroblasts and aggregate them together.
grep("fibroblast", metadata$predicted.ann_finest_level, value = TRUE) %>% unique()
## [1] "Alveolar fibroblasts"      "Adventitial fibroblasts"  
## [3] "Myofibroblasts"            "Peribronchial fibroblasts"
# Alveolar fibroblasts, Adventitial fibroblasts, Peribronchial fibroblasts
metadata <- mutate(metadata, 
                   predicted.ann_finest_level = 
                     recode(predicted.ann_finest_level, 
                            "Alveolar fibroblasts" = "Fibroblasts",
                            "Adventitial fibroblasts" = "Fibroblasts",
                            "Peribronchial fibroblasts" = "Fibroblasts"))

# Find all endothelial cells and aggregate them together.
grep("EC", metadata$predicted.ann_finest_level, value = TRUE) %>% unique()
## [1] "EC venous systemic"           "EC venous pulmonary"         
## [3] "EC aerocyte capillary"        "EC arterial"                 
## [5] "Lymphatic EC differentiating" "EC general capillary"        
## [7] "Lymphatic EC mature"          "Lymphatic EC proliferating"
# EC arterial, EC venous systemic, EC general capillary, EC aerocyte capillary, Lymphatic EC differentiating, EC venous pulmonary, Lymphatic EC mature, Lymphatic EC proliferating.
metadata <- mutate(metadata, 
                   predicted.ann_finest_level = 
                     recode(predicted.ann_finest_level, 
                            "EC general capillary" = "Endothelial capillary",
                            "EC aerocyte capillary" = "Endothelial capillary",
                            "EC arterial" = "Endothelial arterial",
                            "EC venous systemic" = "Endothelial venous",
                            "EC venous pulmonary" = "Endothelial venous"))

metadata <- mutate(metadata, 
                   predicted.ann_finest_level = 
                     recode(predicted.ann_finest_level, 
                            "Lymphatic EC mature" = "Endothelial lymphatic",
                            "Lymphatic EC differentiating" = "Endothelial lymphatic",
                            "Lymphatic EC proliferating" = "Endothelial lymphatic"))

# Find all smooth muscle cells and aggregate them together.
grep("Smooth|SM", metadata$predicted.ann_finest_level, value = TRUE) %>% unique()
## [1] "Smooth muscle"                "SM activated stress response"
## [3] "SMG duct"
# Smooth muscle, SM activated stress response.
metadata <- mutate(metadata, 
                   predicted.ann_finest_level = 
                     recode(predicted.ann_finest_level, 
                            "Smooth muscle" = "VSMCs",
                            "SM activated stress response" = "VSMCs"))

# Find all immune cells and aggregate them together.
metadata$predicted.ann_finest_level[metadata$predicted.ann_level_1 == "Immune"] %>% unique()
##  [1] "B cells"                      "Non-classical monocytes"     
##  [3] "CD8 T cells"                  "Interstitial Mφ perivascular"
##  [5] "CD4 T cells"                  "Classical monocytes"         
##  [7] "DC2"                          "Monocyte-derived Mφ"         
##  [9] "NK cells"                     "Endothelial capillary"       
## [11] "Mast cells"                   "Plasmacytoid DCs"            
## [13] "T cells proliferating"        "Fibroblasts"                 
## [15] "Migratory DCs"                "Plasma cells"                
## [17] "Alveolar Mφ proliferating"    "Transitional Club-AT2"       
## [19] "Endothelial venous"           "Pericytes"                   
## [21] "AT2"                          "Multiciliated (non-nasal)"   
## [23] "Endothelial arterial"         "AT1"                         
## [25] "AT2 proliferating"            "VSMCs"                       
## [27] "Ionocyte"                     "Alveolar macrophages"        
## [29] "Endothelial lymphatic"
# Alveolar macrophages, Alveolar Mφ proliferating, B cells, CD4 T cells, CD8 T cells, Classical monocytes, DC2, Interstitial Mφ perivascular, Mast cells, Migratory DCs, Monocyte-derived Mφ, NK cells, Non-classical monocytes, Plasma cells, Plasmacytoid DCs, T cells proliferating.
metadata <- mutate(metadata, 
                   predicted.ann_finest_level = 
                     recode(predicted.ann_finest_level, 
                            "Alveolar macrophages" = "Immune",
                            "Alveolar Mφ proliferating" = "Immune",
                            "B cells" = "Immune",
                            "CD4 T cells" = "Immune",
                            "CD8 T cells" = "Immune",
                            "Classical monocytes" = "Immune",
                            "DC2" = "Immune",
                            "Interstitial Mφ perivascular" = "Immune",
                            "Mast cells" = "Immune",
                            "Migratory DCs" = "Immune",
                            "Monocyte-derived Mφ" = "Immune",
                            "NK cells" = "Immune",
                            "Non-classical monocytes" = "Immune",
                            "Plasma cells" = "Immune",
                            "Plasmacytoid DCs" = "Immune",
                            "T cells proliferating" = "Immune"))

# Get metadata column and rename.
metadata = (metadata %>%
              rename("predicted.ann_finest_level" = "cell_type"))

# Add edited metadata back to Seurat object.
seurat_cluster@meta.data <- metadata

# Set default idents of Seurat object.
Idents(seurat_cluster) <- "cell_type"

Step 5.5: Final cell type annotation

# Set colour scheme based on annotated cell type.
celltype_col = c("AT1" = "#2f4f4f",
                 "AT2" = "#00bfff",
                 "Basal resting" = "#2e8b57",
                 "Club (non-nasal)" = "#8b008b",
                 "Endothelial arterial" = "#ffa500",
                 "Endothelial capillary" = "#808000",
                 "Endothelial lympathic" = "#00008b",
                 "Endothelial venous" = "#00ff00",
                 "Fibroblasts" = "#ff4500",
                 "Immune" = "#f08080",
                 "Ionocyte" = "#f0e68c",
                 "Multiciliated (non-nasal)" = "#ff00ff",
                 "Myofibroblasts" = "#c71585",
                 "Neuroendocrine" = "#00fa9a",
                 "Pericytes" = "#ff1493",
                 "Suprabasal" = "#eee8aa",
                 "Transitional Club-AT2" = "#9370db",
                 "VSMCs" = "#00ffff")

# Visualize annotated cluster with a UMAP.
DimPlot(seurat_cluster,
        reduction = "umap",
        group.by = "cell_type",
        cols = celltype_col,
        label = TRUE,
        label.size = 4,
        repel = TRUE,
        raster = FALSE) + 
  ggtitle("Final cell type annotation") +
  theme(plot.title = element_text(hjust = 0.5)) 

Step 6: Save Seurat objects.

save(seurat_cluster, 
     file = "/Users/kendrix/Documents/Work/UnityHealth/Data_Projects/scRNAseq_LungFibrosisAnalysis/GSE264278/data/preprocessed_seurat_obj.RData")

References

Analysis codes are adapted from HBCtraining and Ouyang Lab with credits going to the original authors of the publications cited in the book. Reference materials are adapted from SVI Bioinformatics and Cellular Genomics Lab with credits going to the original authors.